# Data preparation: pathwaysPrevFit9.Rdata

# the data file "Norwegian Citizen Panel - wave 1-9 NO.sav" is available at:
# https://nsd.no/nsddata/serier/norsk_medborgerpanel_eng.html

# NB!
# The textProcessor function was updated (in December 2018) after we ran the analyses.
# With the updated textProcessor, the following code produces results that are slightly different from the article.
# The processed text that we analyze in the article includes 2945 documents and 596 terms (in "vocab").
# The following code produces processed text with 2944 documents and 598 terms (in "vocab").

# load packages
library(foreign)
library(stm)

# clear all objects from the global environment so the script starts clean
# (attached packages are not affected)
rm(list=ls(all=TRUE))

# read the semicolon-separated file with the open-ended answers;
# text columns are kept as character (not factor)
setwd("P:/2017-pathways/new/2-data")
rawdata <- read.csv2("pathways-text-data.csv", header=TRUE, sep=";", stringsAsFactors=FALSE)

# add treatment: code which of the three open-answer columns was filled in
# (r7km1a -> 1, r7km1b -> 2, r7km1c -> 3). The assignments run in order, so a
# row with more than one non-empty column gets the highest matching code.
# Rows with no answer in any of the three columns become NA.
rawdata$treatment <- 0 
rawdata$treatment[rawdata$r7km1a!=""] <- 1
rawdata$treatment[rawdata$r7km1b!=""] <- 2
rawdata$treatment[rawdata$r7km1c!=""] <- 3
rawdata$treatment[rawdata$treatment==0] <- NA
rawdata$treatment <- as.factor(rawdata$treatment)

# continue with a working copy named 'data' and combine the three answer
# columns into a single text variable (joined with single spaces)
data <- rawdata
rm(rawdata)
data$openanswer <- paste(data$r7km1a, data$r7km1b, data$r7km1c)
length(data$openanswer)  # number of rows read (printed when run interactively)
data$openanswer.orig <- data$openanswer

# merge with background variables from the Norwegian Citizen Panel file
myvars <- c("responseid",
            "r7P1", "r7P2", "r7P3", "r7P4_1", "r7P4_2", "r7P5_1",
            "r8B2_2", "r7municipalSize",
            "w01_k24", "w03_r3k24", "r4k24", "r5km11")

# read the SPSS file as a data frame with numeric codes instead of value labels
panel <- read.spss(file = "Norwegian Citizen Panel - wave 1-9 NO.sav",
                   use.value.labels = FALSE,
                   to.data.frame = TRUE,
                   trim.factor.names = TRUE)

# keep only the selected columns and add an integer copy of the respondent id
background <- panel[myvars]
background[["serial"]] <- as.integer(background[["responseid"]])

# full outer join on respondent id: rows from either side are kept even when
# they have no match in the other
data <- merge(data, background, by = "responseid", all = TRUE)

# the large panel objects are no longer needed
rm(panel, background)

# copy income and age to readable names; income codes 97 and 98 are set to
# missing (presumably non-substantive answer categories -- confirm in codebook)
data$income <- data$r8B2_2
data$income[data$income %in% c(97, 98)] <- NA
data$age <- data$r7P5_1

# helper: build a 1/0 dummy from a coded variable.
#   x    - vector of numeric codes
#   one  - code(s) that map to 1
#   zero - code(s) that map to 0
# Any other value of x (including NA) gives NA.
code_dummy <- function(x, one, zero) {
  out <- rep(NA_real_, length(x))
  out[x %in% one] <- 1
  out[x %in% zero] <- 0
  out
}

# recode background variables as dummy variables
# r7P1: code 1 -> male, code 2 -> female
data$male <- code_dummy(data$r7P1, one = 1, zero = 2)
data$female <- code_dummy(data$r7P1, one = 2, zero = 1)

# r7municipalSize: code 4 -> urban, codes 1-3 -> not urban
data$urban <- code_dummy(data$r7municipalSize, one = 4, zero = 1:3)

# r7P2: one dummy per region code (1-6); each dummy is 0 for the other five
# codes and NA when r7P2 is outside 1-6 or missing
region_codes <- c(oslo = 1, east = 2, south = 3, west = 4, middle = 5, north = 6)
for (region in names(region_codes)) {
  data[[region]] <- code_dummy(data$r7P2,
                               one = region_codes[[region]],
                               zero = setdiff(1:6, region_codes[[region]]))
}

# r7P4_1: code 3 -> highed, codes 1-2 -> not highed
data$highed <- code_dummy(data$r7P4_1, one = 3, zero = 1:2)

# remove the temporary helper objects again
rm(code_dummy, region_codes, region)

# keep only the variables used in the analysis and drop every row that has a
# missing value on any of them
data<-na.omit(subset(data, select=c(responseid, openanswer, treatment, oslo, north, south, east, west, middle, female, urban, age)))

# store the original text in a separate column before running the substitution list
data$openanswer.orig <- data$openanswer

# load and run the substitution list 
# thanks to https://stackoverflow.com/questions/22888646/making-gsub-only-replace-entire-words
# (presumably the patterns in the list carry their own word-boundary markers;
# verify against the csv file)
setwd("P:/2017-pathways/2-data") # The substitution list is in the '2-data' folder
subs <- read.csv(file="2017-11-29-substitution_list_no.csv", header=TRUE, sep=";", stringsAsFactors = FALSE)
colnames(subs) <- c("orig", "replace")

# lower-case all answers once, then apply each substitution, in list order, to
# all answers at the same time (gsub is vectorised over the text).
# seq_along() makes an empty substitution list a no-op; with 1:length() an
# empty list would run gsub() with an NA pattern and turn every answer into NA.
text <- tolower(data$openanswer)
for (j in seq_along(subs$orig)) {
  text <- gsub(subs$orig[j], subs$replace[j], text, ignore.case=TRUE)
}

# set openanswer=text
data$openanswer <- text

# stemming/stopword removal (Norwegian); the full data frame is passed along as
# document-level metadata
processed <- textProcessor(data$openanswer, metadata=data, language="norwegian", verbose=TRUE)

# structure and index for usage in the stm model. Verify no-missingness.
# The prepared object is stored as 'o_p_data' because that is the name written
# to the .Rdata file below. All later steps read from 'o_p_data'; the original
# code referred to an object 'out' that was never created, so the script
# stopped with "object 'out' not found".
o_p_data <- prepDocuments(processed$documents, processed$vocab, processed$meta,
                          lower.thresh=3)
docs <- o_p_data$documents
vocab <- o_p_data$vocab
meta_o_p_data <- o_p_data$meta

# run stm model: 9 topics, spectral initialisation, topic prevalence modelled
# as a function of treatment, gender, region and age
# NOTE(review): all six region dummies enter the formula together; after the
# listwise deletion above they appear to sum to 1 for every row -- confirm this
# is intended.
pathwaysPrevFit9 <- stm(docs,
                        vocab,
                        K=9,
                        init.type="Spectral",
                        prevalence =~ treatment+female+oslo+west+east+south+middle+north+age, 
                        max.em.its=500, 
                        data=meta_o_p_data)

# save the prepared documents, their metadata and the fitted model
setwd("P:/2017-pathways/new/4-model-output")
save(o_p_data, meta_o_p_data, pathwaysPrevFit9, file="pathwaysPrevFit9.Rdata")


